df <- read.csv("/Users/austinra/BIOS6624/Project 0/Data Raw/Project0_Clean_v2.csv")

EDA:

glimpse(df)
## Rows: 372
## Columns: 15
## $ SubjectID                                    <int> 3012, 3012, 3012, 3012, 3…
## $ Collection.Date                              <chr> "10/2/2018", "10/2/2018",…
## $ Collection.Sample                            <int> 1, 2, 3, 4, 1, 2, 3, 4, 1…
## $ Booket..Clock.Time                           <chr> "8:54", "9:38", "12:31", …
## $ MEMs..Clock.Time                             <chr> "8:55", "9:38", "12:30", …
## $ Sleep.Diary.reported.wake.time               <chr> "8:54", "", "", "", "7:20…
## $ Booklet..Sample.interval                     <chr> "", "0:44:00", "3:37:00",…
## $ Booklet..Sample.interval.Decimal.Time..mins. <int> 0, 44, 217, 644, 0, 37, 3…
## $ MEMs..Sample.interval                        <chr> "", "0:43:00", "3:35:00",…
## $ MEMs..Sample.interval.Decimal.Time..mins.    <int> NA, 43, 215, 643, NA, 36,…
## $ Cortisol..ug.dl.                             <dbl> 0.142, 0.259, 0.084, 0.02…
## $ DHEA..pg.dl.                                 <dbl> 449.743, 152.798, 94.410,…
## $ Cortisol..nmol.L.                            <dbl> 3.91778, 7.14581, 2.31756…
## $ DHEA..nmol.L.                                <dbl> 1.5606082, 0.5302091, 0.3…
## $ DAYNUMB                                      <int> 1, 1, 1, 1, 2, 2, 2, 2, 3…
#Examine which values are blank and which are NA
df %>%
  summarise(
    across(
      everything(),
      list(
        blank = ~ sum(str_trim(as.character(.x)) == "", na.rm = TRUE),
        na = ~ sum(is.na(.x))
      )
    )
  )
##   SubjectID_blank SubjectID_na Collection.Date_blank Collection.Date_na
## 1               0            0                     0                  0
##   Collection.Sample_blank Collection.Sample_na Booket..Clock.Time_blank
## 1                       0                    0                       35
##   Booket..Clock.Time_na MEMs..Clock.Time_blank MEMs..Clock.Time_na
## 1                     0                     61                   0
##   Sleep.Diary.reported.wake.time_blank Sleep.Diary.reported.wake.time_na
## 1                                  279                                 0
##   Booklet..Sample.interval_blank Booklet..Sample.interval_na
## 1                            130                           0
##   Booklet..Sample.interval.Decimal.Time..mins._blank
## 1                                                  0
##   Booklet..Sample.interval.Decimal.Time..mins._na MEMs..Sample.interval_blank
## 1                                              55                         177
##   MEMs..Sample.interval_na MEMs..Sample.interval.Decimal.Time..mins._blank
## 1                        0                                               0
##   MEMs..Sample.interval.Decimal.Time..mins._na Cortisol..ug.dl._blank
## 1                                          177                      0
##   Cortisol..ug.dl._na DHEA..pg.dl._blank DHEA..pg.dl._na
## 1                   0                  0               5
##   Cortisol..nmol.L._blank Cortisol..nmol.L._na DHEA..nmol.L._blank
## 1                       0                    5                   0
##   DHEA..nmol.L._na DAYNUMB_blank DAYNUMB_na
## 1                5             0          0
#35 blank Booket..Clock.Time
#61 blank MEMs..Clock.Time
#279 blank Sleep.Diary.reported.wake.time
#5 Cortisol..nmol.L._na, 5 DHEA..nmol.L._na

Check each SubjectID has proper number of samples

problem_check <- df %>% group_by(SubjectID, DAYNUMB) %>% summarize(n = n()) 
## `summarise()` has grouped output by 'SubjectID'. You can override using the
## `.groups` argument.
#Subject 3029 has 12 observations on day 3, assign new DAYNUMB based on date for this subject
df_mod <- df %>%
  mutate(Collection.Date = as.Date(Collection.Date, format = "%m/%d/%Y")) %>%
  mutate(
    DAYNUMB = case_when(
      SubjectID == 3029 & Collection.Date == "2018-10-07" ~ 1,
      SubjectID == 3029 & Collection.Date == "2018-10-10" ~ 2,
      SubjectID == 3029 & Collection.Date == "2018-10-12" ~ 3,
      TRUE ~ DAYNUMB
    )
  ) %>% ungroup()

Modify variable types

#SubjectID needs to be Categorical
df_mod$SubjectID <- as.character(df_mod$SubjectID)
#Collection.Sample should be Categorical
df_mod$Collection.Sample <- as.character(df_mod$Collection.Sample)
#Booket..Clock.Time should be Time
df_mod$Booket..Clock.Time <-hm(df_mod$Booket..Clock.Time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#MEMs..Clock.Time should be Time
df_mod$MEMs..Clock.Time <- hm(df_mod$MEMs..Clock.Time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#Sleep.Diary.reported.wake.time should be Time 
df_mod$Sleep.Diary.reported.wake.time <- hm(df_mod$Sleep.Diary.reported.wake.time)
## Warning in .parse_hms(..., order = "HM", quiet = quiet): Some strings failed to
## parse
#Sleep.Diary.reported.wake.time carried down for each SubjectID DAYNUMB combination
df_mod <- df_mod %>%
  arrange(
    SubjectID,
    DAYNUMB,
    suppressWarnings(as.integer(Collection.Sample))
  ) %>%
  group_by(SubjectID, DAYNUMB) %>%
  tidyr::fill(Sleep.Diary.reported.wake.time, .direction = "down") %>%
  ungroup()

#Sample Intervals should not be used in the analysis - calculate own values #Booklet..Sample.interval is the Number of hours and minutes from waking recorded by participant formatted as hh:mm where waking was recorded as midnight 00:00 #Booklet..Sample.interval.Decimal.Time..mins. integer representation of previous variable aka number of minutes from waking for the sample as recorded by participant. # MEMs..Sample.interval Number of hours and minutes from waking electronic cap stamp formatted as hh:mm where waking was recorded as midnight 00:00 #MEMs..Sample.interval.Decimal.Time..mins. integer representation of previous variable aka number of minutes from waking electronic stamp.

# Convert times to minutes since midnight
df_mod$booklet_mins <- lubridate::hour(df_mod$Booket..Clock.Time) * 60 +
                          lubridate::minute(df_mod$Booket..Clock.Time)

df_mod$cap_mins <- lubridate::hour(df_mod$MEMs..Clock.Time) * 60 +
                      lubridate::minute(df_mod$MEMs..Clock.Time)

df_mod$wake_mins <- lubridate::hour(df_mod$Sleep.Diary.reported.wake.time) * 60 +
                       lubridate::minute(df_mod$Sleep.Diary.reported.wake.time)


#Creating new variables capturing elapsed time between booklet and wake time AND cap and wake time respectively
df_mod$cap_interval_mins <-
  df_mod$cap_mins - df_mod$wake_mins

df_mod$booklet_interval_mins <-
  df_mod$booklet_mins - df_mod$wake_mins
glimpse(df_mod)
## Rows: 372
## Columns: 20
## $ SubjectID                                    <chr> "3012", "3012", "3012", "…
## $ Collection.Date                              <date> 2018-10-02, 2018-10-02, …
## $ Collection.Sample                            <chr> "1", "2", "3", "4", "1", …
## $ Booket..Clock.Time                           <Period> 8H 54M 0S, 9H 38M 0S, …
## $ MEMs..Clock.Time                             <Period> 8H 55M 0S, 9H 38M 0S, …
## $ Sleep.Diary.reported.wake.time               <Period> 8H 54M 0S, 8H 54M 0S, …
## $ Booklet..Sample.interval                     <chr> "", "0:44:00", "3:37:00",…
## $ Booklet..Sample.interval.Decimal.Time..mins. <int> 0, 44, 217, 644, 0, 37, 3…
## $ MEMs..Sample.interval                        <chr> "", "0:43:00", "3:35:00",…
## $ MEMs..Sample.interval.Decimal.Time..mins.    <int> NA, 43, 215, 643, NA, 36,…
## $ Cortisol..ug.dl.                             <dbl> 0.142, 0.259, 0.084, 0.02…
## $ DHEA..pg.dl.                                 <dbl> 449.743, 152.798, 94.410,…
## $ Cortisol..nmol.L.                            <dbl> 3.91778, 7.14581, 2.31756…
## $ DHEA..nmol.L.                                <dbl> 1.5606082, 0.5302091, 0.3…
## $ DAYNUMB                                      <dbl> 1, 1, 1, 1, 2, 2, 2, 2, 3…
## $ booklet_mins                                 <dbl> 534, 578, 751, 1178, 440,…
## $ cap_mins                                     <dbl> 535, 578, 750, 1178, 441,…
## $ wake_mins                                    <dbl> 534, 534, 534, 534, 440, …
## $ cap_interval_mins                            <dbl> 1, 44, 216, 644, 1, 37, 3…
## $ booklet_interval_mins                        <dbl> 0, 44, 217, 644, 0, 37, 3…

Save processed data

#write.xlsx(df_mod, "/Users/austinra/BIOS6624/Project 0/Data Processed/Project0_ProcessedData.xlsx")

Table One

library(dplyr)
library(tidyr)
library(tibble)
library(knitr)
library(kableExtra)

#ChatGPT created function to handle sprintf errors
mean_sd <- function(x, digits = 2) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return(NA_character_)
  if (length(x) == 1) return(sprintf(paste0("%.", digits, "f (NA)"), x))
  sprintf(
    paste0("%.", digits, "f (%.", digits, "f)"),
    mean(x), sd(x)
  )
}

# Build sample-wise summary
by_sample <- df_mod %>%
  group_by(`Collection.Sample`) %>%
  summarise(
    observations = n(),

    cortisol_missing = sum(is.na(`Cortisol..nmol.L.`)),
    cortisol = mean_sd(`Cortisol..nmol.L.`),

    dhea_missing = sum(is.na(`DHEA..nmol.L.`)),
    dhea = mean_sd(`DHEA..nmol.L.`),

    booklet_interval_missing = sum(is.na(booklet_interval_mins)),
    booklet_interval = mean_sd(booklet_interval_mins),

    cap_interval_missing = sum(is.na(cap_interval_mins)),
    cap_interval = mean_sd(cap_interval_mins),

    .groups = "drop"
  )

# Build overall summary
overall <- df_mod %>%
  summarise(
    Collection.Sample = "Overall",
    observations = n(),

    cortisol_missing = sum(is.na(`Cortisol..nmol.L.`)),
    cortisol = mean_sd(`Cortisol..nmol.L.`),

    dhea_missing = sum(is.na(`DHEA..nmol.L.`)),
    dhea = mean_sd(`DHEA..nmol.L.`),

    booklet_interval_missing = sum(is.na(booklet_interval_mins)),
    booklet_interval = mean_sd(booklet_interval_mins),

    cap_interval_missing = sum(is.na(cap_interval_mins)),
    cap_interval = mean_sd(cap_interval_mins)
  )

final_tbl <- bind_rows(by_sample, overall)

# Rotate to long format 
table1 <- final_tbl %>%
  mutate(across(-`Collection.Sample`, as.character)) %>%
  pivot_longer(-`Collection.Sample`, names_to = "Measure", values_to = "Value") %>%
  pivot_wider(names_from = `Collection.Sample`, values_from = Value) %>%
  mutate(
    Measure = recode(Measure,
      observations = "Observations (N)",
      cortisol_missing = "Missing Cortisol Levels",
      cortisol = "Cortisol (nmol/L)",
      dhea_missing = "Missing DHEA values",
      dhea = "DHEA (nmol/L)",
      booklet_interval_missing = "Missing booklet sample time",
      booklet_interval = "Booklet sample time after waking (minutes)",
      cap_interval_missing = "Missing electronic cap sample time",
      cap_interval = "Cap sample time after waking (minutes)"
    )
  )

#Rename 
table1_pretty <- table1 %>%
  mutate(
    Measure = recode(Measure,
      n_subjects = "Subjects (N)",
      cortisol_missing = "Missing Cortisol Levels",
      cortisol = "Cortisol (nmol/L)",
      dhea_missing = "Missing DHEA values",
      dhea = "DHEA (nmol/L)",
      booklet_interval_missing = "Missing booklet sample time",
      booklet_interval = "Booklet sample time after waking (minutes)",
      cap_interval_missing = "Missing electronic cap sample time",
      cap_interval = "Cap sample time after waking (minutes)"
    )
  )

align_vec <- paste0("l", paste(rep("c", ncol(table1_pretty) - 1), collapse = ""))

table1_pretty %>%
  kbl(
    caption = "Figure 1: Descriptive statistics",
    booktabs = TRUE,
    align = align_vec
  ) %>%
  kable_styling(full_width = FALSE, position = "center") %>%
  row_spec(0, bold = TRUE) %>%footnote(
  general = c(
    "Values are presented as mean (SD) unless otherwise noted.",
    "Collection Sample: 1 = waking; 2 = 30 minutes after waking; 3 = lunch; 4 = 600 minutes after waking."
  ),
  general_title = "Note: "
)
Figure 1: Descriptive statistics
Measure 1 2 3 4 Overall
Observations (N) 93 93 93 93 372
Missing Cortisol Levels 0 1 2 2 5
Cortisol (nmol/L) 7.38 (6.05) 9.00 (4.94) 3.21 (2.29) 4.15 (10.32) 5.95 (6.95)
Missing DHEA values 0 1 2 2 5
DHEA (nmol/L) 1.78 (1.27) 1.11 (0.92) 0.51 (0.46) 0.50 (0.65) 0.98 (1.03)
Missing booklet sample time 5 6 11 13 35
Booklet sample time after waking (minutes) 0.90 (3.71) 34.40 (9.55) 322.85 (84.87) 641.50 (84.19) 239.96 (263.45)
Missing electronic cap sample time 16 23 12 10 61
Cap sample time after waking (minutes) 5.12 (9.20) 50.11 (34.97) 323.41 (79.63) 658.86 (99.07) 272.61 (272.20)
Note:
Values are presented as mean (SD) unless otherwise noted.
Collection Sample: 1 = waking; 2 = 30 minutes after waking; 3 = lunch; 4 = 600 minutes after waking.

Question 1: What is the agreement between the subject’s recordings of sampling times compared to the times recorded by an electric monitoring cap?

Record rates of missingness between the caplet time and booklet time

# Cap time distribution
ggplot(df_mod %>% filter(!is.na(cap_interval_mins)),
       aes(cap_interval_mins)) +
  geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Cap minutes since wake",
    y = "Number of samples",
    title = "Cap Sampling Interval Distribution"
  )

#There are cap times with negative values, let's check how many
negative_captime <- df_mod %>% filter(!is.na(cap_interval_mins) & cap_interval_mins < 0)
table(negative_captime$cap_interval_mins)
## 
## -11  -9  -6  -5  -4  -3  -2  -1 
##   1   1   1   1   1   1   3   5
#14 cap times with negative values, this most likely means the participant recorded a wake time in their booklet later than they actually woke up

na_captime <- df_mod%>% filter(is.na(cap_interval_mins))
table(na_captime$cap_interval_mins, useNA = "ifany")
## 
## <NA> 
##   61
#61 missing cap_interval mins out of 366 observations
# Book time distribution
ggplot(df_mod %>% filter(!is.na(booklet_interval_mins)),
       aes(booklet_interval_mins)) +
  geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Booklet minutes since wake",
    y = "Number of samples",
    title = "Booklet Sampling Interval Distribution"
  )

#2 book times with negative values
negative_booklettime <- df_mod %>% filter(!is.na(booklet_interval_mins) & booklet_interval_mins < 0)
table(negative_booklettime$booklet_interval_mins, useNA = "ifany")
## 
## -12  -2 
##   1   1
na_booklettime <- df_mod%>% filter(is.na(booklet_interval_mins))
table(na_booklettime$booklet_interval_mins, useNA = "ifany")
## 
## <NA> 
##   35
#35 missing booklet_interval_mins out of 366 observations

Removing observations with missing cap_interval_mins or missing booklet_interval_mins from df_mod for this part of the analysis

#Drop all observations where either of these values are missing
df_q1 <- filter(df_mod, !is.na(cap_interval_mins) & !is.na(booklet_interval_mins))
#Scatter plot of cap time (minutes) from waking up and booklet time (minutes) since waking up (continuous)
ggplot(df_q1, aes(x = cap_interval_mins, y = booklet_interval_mins)) +
  geom_point(alpha = 0.6) +
  labs(
    x = "Cap interval (minutes since wake)",
    y = "Booklet interval (minutes since wake)",
    title = "Cap vs Booklet Minutes from Waking"
  ) +
  geom_abline(slope = 1, intercept = 0, color = "blue") +
  theme_minimal() +
  coord_equal()

Find the difference between recorded sampling time and recorded cap time and print summary statistics

df_q1$booklet_cap_difference <- df_q1$booklet_interval_mins - df_q1$cap_interval_mins
summary(df_q1$booklet_cap_difference)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -200.000   -7.000   -1.000   -7.712    1.000  133.000

We have no NA values, as expected, and booklet time is often recorded earlier than cap time.

Plot distribution of the difference between booklet sampling time and cap sampling time

ggplot(df_q1,
       aes(booklet_cap_difference)) +
  geom_histogram(binwidth = 10, fill = "steelblue", alpha = 0.8) +
  geom_text(
    stat = "bin",
    binwidth = 10,
    aes(label = after_stat(count)),
    vjust = -0.3,
    size = 3
  ) +
  theme_minimal(base_size = 14) +
  labs(
    x = "Booklet interval minus cap interval",
    y = "Number of samples",
    title = "Difference in Sampling Interval Distribution"
  )

Booklet and Cap Time fall pretty close together with some extreme outliers representing large differences between recorded booklet time and cap time.

Comparing booklet and electric monitoring cap using a mixed model:

q1.lmm <- lmer(booklet_interval_mins ~ cap_interval_mins + (1|SubjectID), 
               data = df_q1)
summary(q1.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: booklet_interval_mins ~ cap_interval_mins + (1 | SubjectID)
##    Data: df_q1
## 
## REML criterion at convergence: 2787.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.9420  0.0024  0.1588  0.2635  4.3866 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SubjectID (Intercept)  44.11    6.642  
##  Residual              989.60   31.458  
## Number of obs: 285, groups:  SubjectID, 31
## 
## Fixed effects:
##                    Estimate Std. Error        df t value Pr(>|t|)    
## (Intercept)        -6.50779    2.91608  66.29918  -2.232    0.029 *  
## cap_interval_mins   0.99494    0.00706 262.91666 140.931   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## cp_ntrvl_mn -0.641
confint(q1.lmm)
## Computing profile confidence intervals ...
##                         2.5 %     97.5 %
## .sig01              0.0000000 12.2740368
## .sigma             28.8548052 34.3503012
## (Intercept)       -12.2587408 -0.7985482
## cap_interval_mins   0.9810298  1.0087634

For every 1 minute increase in recorded cap time, recorded booklet time increases by 0.995 minutes (95% CI: 0.981, 1.008, p-value: <0.0001). This suggests there is strong agreement between recorded cap time and recorded booklet time. The intercept suggests there is bias though, subjects recorded sample collection in their booklet about 6.508 (95% CI: -12.259, -0.799, p-value: 0.029) minutes earlier than recorded sample collection by the electronic cap.

Visualizing fit

plot(q1.lmm)

qqnorm(residuals(q1.lmm))
qqline(residuals(q1.lmm))

The residuals are fairly close to 0 for most of the fitted values but there are a few observations with very large negative residuals. The qqplot shows the model is doing a good job fitting the majority of the data but there is strong deviation in the tails with some large negative and positive values.

Visualizing fitted vs observed values with identity line and fitted line

pred_q1 <- tibble(
  cap_interval_mins = seq(min(df_q1$cap_interval_mins, na.rm = TRUE),
                          max(df_q1$cap_interval_mins, na.rm = TRUE),
                          length.out = 200)
)

pred_q1$booklet_pred <- predict(q1.lmm, newdata = pred_q1, re.form = NA)

ggplot(df_q1, aes(x = cap_interval_mins, y = booklet_interval_mins)) +
  geom_point(alpha = 0.6) +
  geom_line(
    data = pred_q1,
    aes(x = cap_interval_mins, y = booklet_pred),
    inherit.aes = FALSE,
    color = "red"
  ) +
  labs(
    x = "Electronic interval (minutes since wake)",
    y = "Booklet interval (minutes since wake)",
    title = "Electronic Cap vs Booklet Minutes from Waking"
  ) +
  geom_abline(slope = 1, intercept = 0, color = "blue") +
  theme_minimal() +
  coord_equal()

Question 2: Are subjects adhering accurately to the +30 min and +10 hour sampling times required by a protocol?

Adherence window 30 minutes (Collection 2) and 10 hours (Collection 4) after waking One analysis for good adherence = 7.5 minutes before or after (inclusive) - proportion of those who met this One analysis for adequate adherence = 15 minutes before or after (inclusive) - proportion of those who met this #Create collection time var to grab samples we’re interested in and booklet and cap deviation times from those collection windows

df_q2 <- filter(df_mod, Collection.Sample %in% c("2","4"))

df_q2 <- df_q2 %>%
  mutate(
    cap_good = case_when(
      Collection.Sample == "2" ~ between(cap_interval_mins, 22.5, 37.5),
      Collection.Sample == "4" ~ between(cap_interval_mins, 592.5, 607.5),
      TRUE ~ NA
    ),

    cap_adequate = case_when(
      Collection.Sample == "2" ~ between(cap_interval_mins, 15, 45),
      Collection.Sample == "4" ~ between(cap_interval_mins, 585, 615),
      TRUE ~ NA
    ),
    
    booklet_good = case_when(
      Collection.Sample == "2" ~ between(booklet_interval_mins, 22.5, 37.5),
      Collection.Sample == "4" ~ between(booklet_interval_mins, 592.5, 607.5),
      TRUE ~ NA
    ),

    booklet_adequate = case_when(
      Collection.Sample == "2" ~ between(booklet_interval_mins, 15, 45),
      Collection.Sample == "4" ~ between(booklet_interval_mins, 585, 615),
      TRUE ~ NA
    )
  )

Tracking adherence

adherence_summary <- df_q2 %>%
  group_by(Collection.Sample) %>%
  summarise(
    N = n(),

    #cap
    cap_good_n = sum(cap_good == TRUE, na.rm = TRUE),
    cap_adequate_n = sum(cap_adequate == TRUE, na.rm = TRUE),
    cap_missing_n = sum(is.na(cap_interval_mins)),

    #booklet
    booklet_good_n = sum(booklet_good == TRUE, na.rm = TRUE),
    booklet_adequate_n = sum(booklet_adequate == TRUE, na.rm = TRUE),
    booklet_missing_n = sum(is.na(booklet_interval_mins)),

    .groups = "drop"
  ) %>%
  mutate(
    cap_good_pct = round(100 * cap_good_n / (N - cap_missing_n), 1),
    cap_adequate_pct = round(100 * cap_adequate_n / (N - cap_missing_n), 1),

    booklet_good_pct = round(100 * booklet_good_n / (N - booklet_missing_n), 1),
    booklet_adequate_pct = round(100 * booklet_adequate_n / (N - booklet_missing_n), 1)
  )

#Create a tidy table for report/presentation
adherence_table <- adherence_summary %>%
  mutate(Collection.Sample = recode(Collection.Sample,
                                    "2" = "30 min",
                                    "4" = "10 hr")) %>%
  rename(
    `Sample Time` = Collection.Sample,
    `N` = N,
    `Cap Missing (n)` = cap_missing_n,
    `Booklet Missing (n)` = booklet_missing_n,
    `Cap Sample Good Adherence (%)` = cap_good_pct,
    `Cap Sample Adequate Adherence (%)` = cap_adequate_pct,
    `Booklet Sample Good Adherence (%)` = booklet_good_pct,
    `Booklet Sample Adequate Adherence (%)` = booklet_adequate_pct
  ) %>%
  dplyr::select(
    `Sample Time`,
    `N`,
    `Cap Missing (n)`,
    `Cap Sample Good Adherence (%)`,
    `Cap Sample Adequate Adherence (%)`,
    `Booklet Missing (n)`,
    `Booklet Sample Good Adherence (%)`,
    `Booklet Sample Adequate Adherence (%)`
  )

kable(adherence_table, caption = "Cap and Booklet Adherence by Collection Sample")
Cap and Booklet Adherence by Collection Sample
Sample Time N Cap Missing (n) Cap Sample Good Adherence (%) Cap Sample Adequate Adherence (%) Booklet Missing (n) Booklet Sample Good Adherence (%) Booklet Sample Adequate Adherence (%)
30 min 93 23 52.9 71.4 6 78.2 89.7
10 hr 93 10 32.5 39.8 13 46.2 56.2

Calculate deviation (minutes) by sample for visualization

df_q2 <- df_q2 %>%
  mutate(
    target_mins = case_when(
      Collection.Sample == "2" ~ 30,
      Collection.Sample == "4" ~ 60*10,
      TRUE ~ NA_real_
    ),
    cap_dev_mins = cap_interval_mins - target_mins,
    booklet_dev_mins = booklet_interval_mins - target_mins
  )

Cap Adherence Deviation Visualization

ggplot(df_q2 %>% filter(!is.na(cap_dev_mins)),
       aes(x = cap_dev_mins)) +
  geom_histogram(binwidth = 10, fill = "steelblue") +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = c(-7.5, 7.5), linetype = "dashed") +
  geom_vline(xintercept = c(-15, 15), linetype = "dotted") +
  theme_minimal() +
  labs(
    x = "Cap deviation from target (minutes)",
    y = "Count",
    title = "Cap Sampling Deviations"
  )

Booklet Adherence Deviation

ggplot(df_q2 %>% filter(!is.na(booklet_dev_mins)),
       aes(x = booklet_dev_mins)) +
  geom_histogram(binwidth = 10, fill = "steelblue") +
  geom_vline(xintercept = 0) +
  geom_vline(xintercept = c(-7.5, 7.5), linetype = "dashed") +
  geom_vline(xintercept = c(-15, 15), linetype = "dotted") +
  theme_minimal() +
  labs(
    x = "Booklet deviation from target (minutes)",
    y = "Count",
    title = "Booklet Sampling Deviations"
  )

Let’s see whether cap deviations vary by sample:

ggplot(df_q2 %>% filter(!is.na(cap_dev_mins)),
       aes(factor(Collection.Sample), cap_dev_mins)) +
  geom_boxplot() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Sample",
    y = "Cap deviation (minutes)",
    title = "Cap Deviations by Sample"
  )

Let’s see whether bookletdeviations vary by sample:

ggplot(df_q2 %>% filter(!is.na(booklet_dev_mins)),
       aes(factor(Collection.Sample), booklet_dev_mins)) +
  geom_boxplot() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Sample",
    y = "Booklet deviation (minutes)",
    title = "Booklet Deviations by Sample"
  )

Visualize cap and booklet times together

dfq2_long <- df_q2 %>%
  pivot_longer(
    cols = c(cap_dev_mins, booklet_dev_mins),
    names_to = "method",
    values_to = "dev_mins"
  ) %>%
  filter(!is.na(dev_mins)) %>%  
  mutate(
    method = recode(method,
                    cap_dev_mins = "Cap",
                    booklet_dev_mins = "Booklet"),
    Collection.Sample = factor(Collection.Sample)
  )

ggplot(dfq2_long, aes(Collection.Sample, dev_mins, fill = method)) +
  geom_boxplot(position = position_dodge(width = 0.75)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  theme_minimal(base_size = 14) +
  labs(
    x = "Sample",
    y = "Deviation (minutes)",
    title = "Cap and Booklet Deviations by Sample",
    fill = "Method"
  )

Question 3: What are the patterns of change over the course of the day? What is the investigator interested in knowing about changes over time? The investigator stated that levels of cortisol are expected to increase sharply from waking to 30 minutes after waking and then decline more slowly over the rest of the day.

She was interested in knowing if the SPIT test shows this expected pattern.

She wanted to know: 1) is there an increase in cortisol and/or DHEA from waking to 30 minutes post waking? And 2) what is the rate of decline in cortisol and DHEA after 30 minutes from waking?

She also stated that even if there are not statistically significant differences, she would like to know the estimates for the initial increase and rates of decline so she can compare them to what is known from other more standard hormone tests.

Biological Range of Data Cortisol > 26 keep but weird, Cortisol > 80 incorrect - probably a laboratory error DHEA upper limit on assay is 5.205 (max value in dataset - exclude these) - probably an error unless this is recorded repeatedly by the same individual (probably exclude these individuals bc they have a different process going on)

Outliers: Check no cortisol > 80, check no DHEA >5.205

#One outlier with cortisol value > 80 - remove this
df_q3 <- filter(df_mod, Cortisol..nmol.L. < 80)

#DHEA investigation of upper limit lab values
x <- filter(df_q3,DHEA..nmol.L. == 5.205)

#SubjectID 3037 needs to be be removed as it is at the upper limit multiple times, suggesting something is wrong with this patient
df_q3 <- filter(df_q3, SubjectID != 3037)

Visualization of Subject 3024 strange behavior - overall this is fine, we’ll leave it in the analysis

ggplot((df_q3 %>% filter(SubjectID == 3024)), aes(x = cap_interval_mins, y = DHEA..nmol.L.)) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "DHEA (nmol/L)",
    title = "DHEA Over Time Since Waking"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggplot((df_q3 %>% filter(SubjectID == 3024)), aes(x = cap_interval_mins, y = Cortisol..nmol.L.)) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "Cortisol (nmol/L)",
    title = "Cortisol Over Time Since Waking"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

#Visualize cortisol
hist(df_q3$Cortisol..nmol.L., main = "Histogram of Cortisol (nmol/L)", xlab = "Cortisol (nmol/L)")

#Heavy right skew - let's look at a log transformation
hist(log(df_q3$Cortisol..nmol.L.), main = "Histogram of Cortisol (nmol/L)", xlab = "Cortisol (nmol/L)")

#Visualizing DHEA
hist(df_q3$DHEA..nmol.L., main = "Histogram of DHEA (nmol/L)", xlab = "DHEA (nmol/L)")

#Again heavy right skew - let's look at the log transform
hist(log(df_q3$DHEA..nmol.L.),  main = "Histogram of DHEA (nmol/L)", xlab = "DHEA (nmol/L)")

summary(df_q3$DHEA..nmol.L.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1076  0.2991  0.5784  0.9224  1.2397  5.2050
summary(df_q3$Cortisol..nmol.L.)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5242  2.0486  4.1109  5.6825  7.6217 37.6879

Significant outlier of 37, leaving it in the analysis but noting it’s strange

ggplot(df_q3, aes(x = cap_interval_mins, y = Cortisol..nmol.L., group = SubjectID, color = factor(SubjectID))) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "Cortisol (nmol/L)",
    title = "Cortisol Over Time Since Waking"
  )
## Warning: Removed 56 rows containing missing values or values outside the scale range
## (`geom_line()`).

Zoom in to verify peak cortisol is where we expect

ggplot(df_q3, aes(x = cap_interval_mins, y = Cortisol..nmol.L., group = SubjectID, color = factor(SubjectID))) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "Cortisol (nmol/L)",
    title = "Cortisol Over Time Since Waking"
  )+
  coord_cartesian(xlim = c(0, 120))
## Warning: Removed 56 rows containing missing values or values outside the scale range
## (`geom_line()`).

ggplot(df_q3, aes(x = cap_interval_mins, y = DHEA..nmol.L., group = SubjectID, color = factor(SubjectID))) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "DHEA (nmol/L)",
    title = "DHEA Over Time Since Waking"
  )
## Warning: Removed 56 rows containing missing values or values outside the scale range
## (`geom_line()`).

Let’s see if DHEA even needs a changepoint like cortisol

ggplot(df_q3, aes(x = cap_interval_mins, y = DHEA..nmol.L., group = SubjectID, color = factor(SubjectID))) +
  geom_line() +
  theme_minimal(base_size = 14) +
  labs(
    x = "Minutes since waking (Book)",
    y = "DHEA (nmol/L)",
    title = "DHEA Over Time Since Waking"
  ) +
  coord_cartesian(xlim = c(0, 120))
## Warning: Removed 56 rows containing missing values or values outside the scale range
## (`geom_line()`).

Changepoint at 30 might not make sense let’s investigate further

dhea_peak_bysubj <- df_q3 %>%
  filter(!is.na(DHEA..nmol.L.), !is.na(booklet_interval_mins)) %>%
  group_by(SubjectID) %>%
  slice_max(DHEA..nmol.L., with_ties = FALSE) %>%
  summarise(
    peak_minute = booklet_interval_mins,
    peak_dhea = DHEA..nmol.L.,
    .groups = "drop"
  )
dhea_peak_by_subj_day <- df_q3 %>%
  filter(!is.na(DHEA..nmol.L.), !is.na(booklet_interval_mins), !is.na(DAYNUMB)) %>%
  group_by(SubjectID, DAYNUMB) %>%
  arrange(desc(DHEA..nmol.L.), booklet_interval_mins) %>%  # highest DHEA, then earliest time
  slice(1) %>%
  ungroup() %>%
  transmute(
    SubjectID,
    DAYNUMB,
    peak_minute = booklet_interval_mins,
    peak_dhea   = DHEA..nmol.L.
  )


hist(dhea_peak_bysubj$peak_minute, main = "Histogram of Peak DHEA Timing (Booklet Minutes since Waking)")

hist(dhea_peak_by_subj_day$peak_minute, main = "Histogram of Peak DHEA Timing (Booklet Minutes since Waking)")

Create variable for 30 minute knot and let’s proceed with logging DHEA and Cortisol

#Let's verify knot - 34 mins is our mean but the literature suggests 30 minutes so let's stick with that
mean(
  df_q3 %>%
    filter(Collection.Sample == "2") %>%
    pull(booklet_interval_mins),
  na.rm = TRUE
)
## [1] 34.28571
#Modified from BIOS 6643 code- changepoint creation
df_q3 <- df_q3 %>%
  mutate(
    wake_30 = pmin(booklet_interval_mins, 30),
    after_30 = pmax(booklet_interval_mins - 30, 0),
    log_cortisol = log(Cortisol..nmol.L.),
    log_dhea = log(DHEA..nmol.L.)
  )

Piecewise linear regression (lmm w random slope for subject) for Cortisol

q3_cortisol.lmm <- lmer(
  log_cortisol ~ wake_30 + after_30 + (1 | SubjectID),
  data = df_q3
)

summary(q3_cortisol.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log_cortisol ~ wake_30 + after_30 + (1 | SubjectID)
##    Data: df_q3
## 
## REML criterion at convergence: 775.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1531 -0.5068  0.0402  0.5695  3.9510 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SubjectID (Intercept) 0.1147   0.3387  
##  Residual              0.5352   0.7316  
## Number of obs: 323, groups:  SubjectID, 30
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  1.684e+00  1.024e-01  9.086e+01  16.438   <2e-16 ***
## wake_30      6.134e-03  3.647e-03  2.912e+02   1.682   0.0936 .  
## after_30    -2.231e-03  1.851e-04  2.944e+02 -12.052   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) wak_30
## wake_30  -0.594       
## after_30  0.003 -0.503
cortisol_table <- tidy(q3_cortisol.lmm, effects = "fixed", conf.int = TRUE)
cortisol_table <- cortisol_table %>%
  mutate(
    estimate_exp = exp(estimate),
    conf.low_exp = exp(conf.low),
    conf.high_exp = exp(conf.high)
  )

cortisol_table_exp <- cortisol_table %>%
  dplyr::select(term, estimate_exp, conf.low_exp, conf.high_exp, p.value) %>%
  rename(
    Term = term,
    `Rate Ratio` = estimate_exp,
    `95% CI Lower` = conf.low_exp,
    `95% CI Upper` = conf.high_exp,
    `p-value` = p.value
  )
kable(cortisol_table_exp, digits = 3,
      caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates")
Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates
Term Rate Ratio 95% CI Lower 95% CI Upper p-value
(Intercept) 5.387 4.395 6.603 0.000
wake_30 1.006 0.999 1.013 0.094
after_30 0.998 0.997 0.998 0.000
kable(cortisol_table_exp, digits = 3,
      caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates")
Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates
Term Rate Ratio 95% CI Lower 95% CI Upper p-value
(Intercept) 5.387 4.395 6.603 0.000
wake_30 1.006 0.999 1.013 0.094
after_30 0.998 0.997 0.998 0.000

Linear regression (lmm w random slope for subject) for DHEA

q3_dhea.lmm <- lmer(
  log_dhea ~ booklet_interval_mins + (1 | SubjectID),
  data = df_q3
)

summary(q3_dhea.lmm)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: log_dhea ~ booklet_interval_mins + (1 | SubjectID)
##    Data: df_q3
## 
## REML criterion at convergence: 703.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1476 -0.5581 -0.0167  0.6450  2.5377 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  SubjectID (Intercept) 0.2625   0.5123  
##  Residual              0.4061   0.6372  
## Number of obs: 323, groups:  SubjectID, 30
## 
## Fixed effects:
##                         Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)           -2.856e-02  1.059e-01  3.550e+01   -0.27    0.789    
## booklet_interval_mins -2.000e-03  1.362e-04  2.955e+02  -14.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## bklt_ntrvl_ -0.314

Get results in an easy to read format

dhea_table <- tidy(q3_dhea.lmm, effects = "fixed", conf.int = TRUE)
dhea_table <- dhea_table %>%
  mutate(
    estimate_exp = exp(estimate),
    conf.low_exp = exp(conf.low),
    conf.high_exp = exp(conf.high)
  )

dhea_table_exp <- dhea_table %>%
  dplyr::select(term, estimate_exp, conf.low_exp, conf.high_exp, p.value) %>%
  rename(
    Term = term,
    `Rate Ratio` = estimate_exp,
    `95% CI Lower` = conf.low_exp,
    `95% CI Upper` = conf.high_exp,
    `p-value` = p.value
  )

kable(dhea_table, digits = 3,
      caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates")
Linear Mixed Model of Log DHEA on Minutes Since Waking - Raw Estimates
effect term estimate std.error statistic df p.value conf.low conf.high estimate_exp conf.low_exp conf.high_exp
fixed (Intercept) -0.029 0.106 -0.27 35.505 0.789 -0.243 0.186 0.972 0.784 1.205
fixed booklet_interval_mins -0.002 0.000 -14.69 295.508 0.000 -0.002 -0.002 0.998 0.998 0.998
kable(dhea_table_exp, digits = 3,
      caption = "Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates")
Linear Mixed Model of Log DHEA on Minutes Since Waking - Transformed Estimates
Term Rate Ratio 95% CI Lower 95% CI Upper p-value
(Intercept) 0.972 0.784 1.205 0.789
booklet_interval_mins 0.998 0.998 0.998 0.000

From resource on canvas: If this is a value great than 1, you can subtract 1 and multiply by 100 to get the % increase associated with a 1-unit increase in X1.

If this value is less than 1, you can subtract 1 and multiple by 100 to get the % decrease associated with a 1-unit increase in X1.

This means that exp(beta1) can be roughly interpreted as the percentage change in of Y with a one unit increase in X1.

Visualizing the fit of Cortisol

pred_df <- tibble(
  booklet_interval_mins = seq(min(df_q3$booklet_interval_mins, na.rm = TRUE),
                          max(df_q3$booklet_interval_mins, na.rm = TRUE),
                          length.out = 200)
) %>%
  mutate(
    wake_30 = pmin(booklet_interval_mins, 30),
    after_30 = pmax(booklet_interval_mins - 30, 0)
  )

pred_df$pred_log_cort <- predict(q3_cortisol.lmm, newdata = pred_df, re.form = NA)

# Back-transform to nmol/L
pred_df$pred_cort <- exp(pred_df$pred_log_cort)

ggplot(df_q3,
       aes(x = booklet_interval_mins,
           y = Cortisol..nmol.L.,
           group = SubjectID,
           color = factor(SubjectID))) +
  geom_line() +
  geom_line(
    data = pred_df,
    aes(x = booklet_interval_mins, y = pred_cort),
    inherit.aes = FALSE,
    linewidth = 1,
    color = "black"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  labs(
    x = "Minutes since waking (booklet)",
    y = "Cortisol (nmol/L)",
    title = "Cortisol Levels Over Time Since Waking (with Model Fit)"
  )
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).

Visualizing the fit of DHEA

pred_dhea <- tibble(
  booklet_interval_mins = seq(min(df_q3$booklet_interval_mins, na.rm = TRUE),
                              max(df_q3$booklet_interval_mins, na.rm = TRUE),
                              length.out = 200)
)

pred_dhea$pred_log_dhea <- predict(q3_dhea.lmm, newdata = pred_dhea, re.form = NA)

# Back-transform to nmol/L
pred_dhea$pred_dhea <- exp(pred_dhea$pred_log_dhea)

ggplot(df_q3,
       aes(x = booklet_interval_mins,
           y = DHEA..nmol.L.,
           group = SubjectID,
           color = factor(SubjectID))) +
  geom_line() +
  geom_line(
    data = pred_dhea,
    aes(x = booklet_interval_mins, y = pred_dhea),
    inherit.aes = FALSE,
    linewidth = 1,
    color = "black"
  ) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "none") +
  labs(
    x = "Minutes since waking (booklet)",
    y = "DHEA (nmol/L)",
    title = "DHEA Levels Over Time Since Waking (With Model Fit)"
  )
## Warning: Removed 31 rows containing missing values or values outside the scale range
## (`geom_line()`).